## stm.R
## CLIMDO
## Mikael Poul Johannesson

## Start matter ------------------------------------------------------------------------------------

set.seed(123)

pkgs <- c("dplyr", "tidyr", "haven", "xlsx", "stm", "foreach", "iterators", "doParallel", "parallel")
for (pkg in pkgs) library(pkg, character.only = TRUE, verbose = FALSE)

##registerDoParallel(makeCluster(20))
registerDoParallel(makeCluster(3))

## Load functions ---------------------------------------------------------------------------------

source("stm_plot_functions.R")  # Includes functions for plotting stm models.

run_stm_models <- function(topics, ...) {
    '%op%' <- ifelse(getDoParWorkers() > 1, foreach::'%dopar%', foreach::'%do%')
    models <- foreach(top = topics, .packages = "stm", .inorder = TRUE) %op% selectModel(K = top, ...)
    names(models) <- paste0("K", topics)
    return(models)
}

## Import and prepare corpus ----------------------------------------------------------------------

ncp <- read.csv("data/climdo-data.csv")

nrow(ncp[!is.na(ncp$text), ])

ncp <- ncp %>%
    filter(!is.na(age) & !is.na(gender) & !is.na(education) & !is.na(political_interest) & !is.na(not_allow_love))

text <- ncp$text %>%
    tolower() %>%
    gsub("[^A-Za-zæøåÆØÅäÄöÖ_-]", " ", .) %>%
    zap_empty()

## corrections <- "data/substitutions-ET.xlsx" %>%  #  Manual spelling/stemming corrections
##   readxl::read_excel() %>%
##   mutate(from = tolower(from)) %>%
##   filter(!is.na(from))

corrections <- readxl::read_excel("data/manual_corrections.xlsx") %>%
  mutate(from = tolower(from)) %>%
  filter(!is.na(from))

text_alt1 <- text
text_alt2 <- text
for (i in 1:nrow(corrections)) {
  text_alt1 <- gsub(paste0(" ", corrections$from[i], " "),
               paste0(" ", corrections$splittET[i], " "),
               text_alt1)
  text_alt2 <- gsub(paste0(corrections$from[i]), paste0(" ", corrections$splittET[i], " "), text_alt2)
}

text_alt3 <- lapply(tokenizers::tokenize_words(text), function(x) {
  lapply(x, function(y) {
    sapply(nrow(corrections), function(z) {
      gsub(corrections$from[z], corrections$splittET[z], y)
    })
  })
})

text_alt3 <- sapply(text_alt3, paste, collapse = " ")

text <- text %>%
    gsub("\\s+", " ", .) %>%
    gsub("^\\s+|\\s+$", "", .) %>%  # Remove trailing/leading white space.
    zap_empty()

## comparison <- lapply(c("text", "text_alt1", "text_alt2", "text_alt3"), function(x) {
##   data <- get(x)
##   proc <- textProcessor(data, metadata = ncp, language = "norwegian", verbose = FALSE)
##   message(x)
##   out <- data_frame(
##     "id" = ncp$id[-c(proc$docs.removed)],
##     "correct" = data[-c(proc$docs.removed)],
##     "processed" = sapply(lapply(proc$documents, function(x) paste0(proc$vocab[x[1, ]], "(x", x[2, ], ")")),
##                          paste, collapse = " ")
##   )
##   names(out) <- c("id", paste0(x, "_", c("text", "processed")))
##   return(out)
## })

## comparison <- comparison[[1]] %>%
##   full_join(comparison[[2]], by = "id") %>%
##   full_join(comparison[[3]], by = "id") %>%
##   full_join(comparison[[4]], by = "id")

## xlsx::write.xlsx(as.data.frame(comparison), file = "output/texts_comparison.xlsx", row.names = FALSE)

## Preprocess and prep.  ------------------------------------------------------------------------------

processed <- textProcessor(text, metadata = ncp, language = "norwegian", verbose = FALSE)
out <- prepDocuments(processed$documents, processed$vocab, processed$meta, lower.thresh = 5, verbose = FALSE)
raw_text <- ncp$text[-c(processed$docs.removed)][-c(out$docs.removed)]
save(raw_text, text, processed, out, file = "output/prepared_corpus.RData")

## Write processed texts to file  ---------------------------------------------------------------------

for (output in c("pdf", "docx")) {
    texts_to_document(raw_text, out$meta,  # Writes all answer to a text document (from stm_plot_functions.R).
                      "output/climdo_texts_metadata", output,
                      "CLIMDO Project: All open-ended answers (with meta data)",
                      add = c("age", "gender", "education", "interest"))
    texts_to_document(raw_text, meta = NULL,
                      "output/climdo_texts", output,
                      "CLIMDO Project: All open-ended answers")
}

all_texts <- data.frame("original" = ncp$text[-c(processed$docs.removed)],
                        "manually_corrected" = text[-c(processed$docs.removed)],
                        "processed" = sapply(lapply(processed$documents, function(x) paste0(processed$vocab[x[1, ]], "(x", x[2, ], ")")),
                                             paste, collapse = " "),
                        stringsAsFactors = FALSE)

xlsx::write.xlsx(all_texts, file = "output/climdo_texts_comparison.xlsx", row.names = FALSE)

## Run stm --------------------------------------------------------------------------------------------

stm_models <- run_stm_models(topics = 3:12,
                             documents = out$documents,
                             vocab = out$vocab,
                             prevalence = ~ age + gender + education + political_interest,
                             data = out$meta,
                             runs = 100,
                             N = 4,
                             max.em.its = 500,
                             seed = 123)

##save(stm_models, file = "output/stm_models.RData")
load("output/stm_models.RData")
load("output/prepared_corpus.RData")

## Plot models -----------------------------------------------------------------------------------------

description <- "This is part of the 'CLIMDO' project. Respondents in the Norwegian Citizen Panel were asked 'When it comes to climate change, what do you think should be done?' as an open-ended question. About one fourth of the respondent were asked in wave 4 (spring 2015) and the rest in wave 5 (autumn 2015)."

plot_stm_models(stm_models, raw_text, processed, out,  # See stm_plot_functions.R
                path = "output/climdo_stm_v6.pdf",
                author = "Mikael Johannesson (mikael.johannesson@uib.no)",
                version = "V6",
                project = "CLIMDO",
                description = description)
